package com.skz.job.spider;

import com.skz.job.domain.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * @author: 宋开宗
 * @create: 2019-02-18 22:55
 **/
/**
 * WebMagic {@link PageProcessor} that walks paginated job-list pages and extracts a
 * {@link Job} from every job-detail page it reaches.
 *
 * <p>List pages are followed by appending an incrementing page number to {@code listUrl}
 * until {@link #MAX_PAGE} is reached. Detail pages are recognised by their URL containing
 * {@code "job_detail"}.
 */
public class BossZhiPinSpider implements PageProcessor {
    private static final Logger LOGGER = LoggerFactory.getLogger(BossZhiPinSpider.class);

    /** URL fragment that identifies a job-detail page. */
    private static final String JOB_DETAIL_MATCH = "job_detail";
    /** Text shown by the site when it demands a captcha instead of serving the page. */
    private static final String CAPTCHA_TEXT = "为了您的账号安全，我们需要在执行操作之前验证您的身份，请输入验证码。";
    /** List pagination stops once the page counter reaches this value. */
    private static final int MAX_PAGE = 12;

    /** How long to pause after hitting a captcha, giving a human time to solve it. */
    private static final long CAPTCHA_PAUSE_MILLIS = 10_000L;
    /** Minimum pause after each detail page; a random extra of up to {@link #DETAIL_PAUSE_JITTER_MILLIS} is added. */
    private static final long DETAIL_PAUSE_BASE_MILLIS = 2_000L;
    private static final long DETAIL_PAUSE_JITTER_MILLIS = 1_000L;

    /** Common prefix of the detail-page sections that may hold the address / map image. */
    private static final String DETAIL_SECTION_XPATH_PREFIX = "//*[@id=\"main\"]/div[3]/div/div[2]/div[2]/div[";
    /** Section indexes to probe, in priority order; the layout shifts between postings. */
    private static final int[] DETAIL_SECTION_CANDIDATES = {6, 5, 4};

    /** Markers delimiting the coordinate pair inside the map image URL. */
    private static final String COORDINATE_START_MARKER = "A:";
    private static final String COORDINATE_END_MARKER = "&key";

    private final String listUrl;
    private int listStartPage;

    /**
     * @param listUrl       list-page URL prefix; the page number is appended directly to it
     * @param listStartPage page number of the first list page being crawled
     */
    public BossZhiPinSpider(String listUrl, int listStartPage) {
        this.listUrl = listUrl;
        this.listStartPage = listStartPage;
    }

    /**
     * Dispatches a downloaded page: captcha pages are re-queued after a pause, detail pages
     * are turned into a {@link Job}, and anything else is treated as a list page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Testing showed a captcha appears after roughly 100 requests within a few minutes.
        // Options are cracking the captcha, proxy IPs, or solving it manually; tess4j OCR was
        // inaccurate and free proxies were unstable, so the captcha is solved by hand for now.
        if (page.getHtml().get().contains(CAPTCHA_TEXT)) {
            handleCaptcha(page);
            return;
        }
        if (page.getUrl().get().contains(JOB_DETAIL_MATCH)) {
            processDetail(page);
        } else {
            processList(page);
        }
    }

    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(3);
    }

    /** Re-queues the page that was replaced by a captcha and pauses so it can be solved manually. */
    private void handleCaptcha(Page page) {
        // Put the page that actually hit the captcha back into the crawl queue (previously the
        // last successfully scraped URL was re-added, so the blocked page was lost).
        // NOTE(review): a duplicate-removing scheduler may drop a plain re-add of an
        // already-seen URL — confirm against the scheduler configured on the Spider.
        page.addTargetRequest(page.getUrl().get());
        LOGGER.info("出现验证码，开始睡眠");
        // A human visits any detail page and solves the captcha during this pause.
        pause(CAPTCHA_PAUSE_MILLIS);
        LOGGER.info("睡眠结束");
    }

    /** Extracts a {@link Job} from a detail page, or skips the page when its coordinates cannot be read. */
    private void processDetail(Page page) {
        String pageUrl = page.getUrl().get();
        String companyName = page.getHtml().xpath("//*[@id=\"main\"]/div[3]/div/div[1]/div[2]/div/a[2]/text()").get();
        String trade = page.getHtml().xpath("//*[@id=\"main\"]/div[3]/div/div[1]/div[2]/p[4]/a/text()").get();
        String description = page.getHtml().xpath("//*[@id=\"main\"]/div[3]/div/div[2]/div[2]/div[1]/div/text()").get();
        String mapImageUrl = firstSectionMatch(page, "]/div/div[2]/img/@src");
        String address = firstSectionMatch(page, "]/div/div[1]/text()");

        String coordinates = parseCoordinates(mapImageUrl);
        if (coordinates == null) {
            // Without this guard a missing or unexpected map image URL crashed processing.
            LOGGER.warn("Skipping {}: no usable coordinates in map image URL [{}]", pageUrl, mapImageUrl);
            page.setSkip(true);
            return;
        }

        Job job = new Job(companyName, trade, description, coordinates, "", "", address, pageUrl, "boss", 3);
        page.putField("job", job);
        // Throttle detail requests with a randomised delay.
        pause((long) (DETAIL_PAUSE_BASE_MILLIS + Math.random() * DETAIL_PAUSE_JITTER_MILLIS));
    }

    /** Queues every detail link found on a list page, then the next list page. */
    private void processList(Page page) {
        List<String> links = page.getHtml().css("#main > div > div.job-list > ul").links().all();
        if (links.isEmpty() || listStartPage >= MAX_PAGE) {
            return;
        }
        for (String link : links) {
            if (link.contains(JOB_DETAIL_MATCH)) {
                page.addTargetRequest(link);
            }
        }
        listStartPage++;
        page.addTargetRequest(listUrl + listStartPage);
    }

    /**
     * Evaluates the same XPath suffix against each candidate detail section and returns the
     * first non-null result.
     *
     * @param xpathSuffix the part of the XPath that follows the section index, starting with {@code "]"}
     * @return the first match, or {@code null} when no candidate section matches
     */
    private static String firstSectionMatch(Page page, String xpathSuffix) {
        for (int section : DETAIL_SECTION_CANDIDATES) {
            String value = page.getHtml().xpath(DETAIL_SECTION_XPATH_PREFIX + section + xpathSuffix).get();
            if (value != null) {
                return value;
            }
        }
        return null;
    }

    /**
     * Pulls the comma-separated coordinate pair found between {@code "A:"} and the last
     * {@code "&key"} in the map image URL and returns it with the two values swapped.
     *
     * @param mapImageUrl the map image {@code src}; may be {@code null}
     * @return {@code "second,first"}, or {@code null} when the URL is absent or malformed
     */
    private static String parseCoordinates(String mapImageUrl) {
        if (mapImageUrl == null) {
            return null;
        }
        int marker = mapImageUrl.indexOf(COORDINATE_START_MARKER);
        int end = mapImageUrl.lastIndexOf(COORDINATE_END_MARKER);
        if (marker < 0 || end < 0) {
            return null;
        }
        int start = marker + COORDINATE_START_MARKER.length();
        if (start > end) {
            return null;
        }
        String[] yx = mapImageUrl.substring(start, end).split(",");
        if (yx.length < 2) {
            return null;
        }
        return yx[1] + "," + yx[0];
    }

    /** Sleeps for the given time; on interruption restores the interrupt flag and returns early. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            LOGGER.warn("Sleep interrupted", e);
        }
    }

}